In [1]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import Sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Activation, Dropout, Flatten, Dense, Input, Layer
from tensorflow.keras.layers import Embedding, LSTM, add, Concatenate, Reshape, concatenate, Bidirectional
from tensorflow.keras.applications import VGG16, ResNet50, DenseNet201
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from textwrap import wrap
plt.rcParams['font.size'] = 12
sns.set_style("dark")
warnings.filterwarnings('ignore')
Image Captioning¶
What is Image Captioning ?
- Image Captioning is the process of generating textual description of an image. It uses both Natural Language Processing and Computer Vision to generate the captions.
- This task lies at the intersection of computer vision and natural language processing. Most image captioning systems use an encoder-decoder framework, where an input image is encoded into an intermediate representation of the information in the image, and then decoded into a descriptive text sequence.
CNNs + RNNs (LSTMs)
- To perform Image Captioning we will require two deep learning models combined into one for the training purpose
- CNNs extract the features from the image of some vector size aka the vector embeddings. The size of these embeddings depend on the type of pretrained network being used for the feature extraction
- LSTMs are used for the text generation process. The image embeddings are concatenated with the word embeddings and passed to the LSTM to generate the next word
- For a more illustrative explanation of this architecture check the Modelling section for a picture representation
In [10]:
import kagglehub
# Download latest version
path = kagglehub.dataset_download("adityajn105/flickr8k")
print("Path to dataset files:", path)
Warning: Looks like you're using an outdated `kagglehub` version (installed: 0.3.13), please consider upgrading to the latest version (0.4.0). Resuming download from 39845888 bytes (1073125275 bytes left)... Resuming download from https://www.kaggle.com/api/v1/datasets/download/adityajn105/flickr8k?dataset_version_number=1 (39845888/1112971163) bytes left.
100%|██████████| 1.04G/1.04G [04:46<00:00, 3.75MB/s]
Extracting files...
Path to dataset files: C:\Users\musaq\.cache\kagglehub\datasets\adityajn105\flickr8k\versions\1
In [39]:
IMAGE_FOLDER = r"C:\Users\musaq\.cache\kagglehub\datasets\adityajn105\flickr8k\versions\1\Images"
CAPTIONS_FILE = r"C:\Users\musaq\.cache\kagglehub\datasets\adityajn105\flickr8k\versions\1\captions.txt"
In [ ]:
data = pd.read_csv(IMAGE_FOLDER + "/../captions.txt")
data.head()
Out[ ]:
| image | caption | |
|---|---|---|
| 0 | 1000268201_693b08cb0e.jpg | A child in a pink dress is climbing up a set o... |
| 1 | 1000268201_693b08cb0e.jpg | A girl going into a wooden building . |
| 2 | 1000268201_693b08cb0e.jpg | A little girl climbing into a wooden playhouse . |
| 3 | 1000268201_693b08cb0e.jpg | A little girl climbing the stairs to her playh... |
| 4 | 1000268201_693b08cb0e.jpg | A little girl in a pink dress going into a woo... |
In [ ]:
def readImage(path, img_size=224):
img = load_img(path, color_mode='rgb', target_size=(img_size, img_size))
img = img_to_array(img)
img = img/255.
return img
def display_images(temp_df):
temp_df = temp_df.reset_index(drop=True)
plt.figure(figsize=(20, 20))
for i in range(15):
plt.subplot(5, 5, i+1)
plt.subplots_adjust(hspace=0.7, wspace=0.3)
img_path = f"{IMAGE_FOLDER}/{temp_df.image[i]}"
image = readImage(img_path)
plt.imshow(image)
plt.title("\n".join(wrap(temp_df.caption[i], 20)))
plt.axis("off")
Visualization¶
- Images and their corresponding captions
In [ ]:
display_images(data.sample(15))
Caption Text Preprocessing Steps¶
- Convert sentences into lowercase
- Remove special characters and numbers present in the text
- Remove extra spaces
- Remove single characters
- Add a starting and an ending tag to the sentences to indicate the beginning and the ending of a sentence
In [ ]:
def text_preprocessing(data):
data['caption'] = data['caption'].apply(lambda x: x.lower())
data['caption'] = data['caption'].apply(lambda x: x.replace("[^A-Za-z]",""))
data['caption'] = data['caption'].apply(lambda x: x.replace("\s+"," "))
data['caption'] = data['caption'].apply(lambda x: " ".join([word for word in x.split() if len(word)>1]))
data['caption'] = "startseq "+data['caption']+" endseq"
return data
Preprocessed Text¶
In [25]:
data = text_preprocessing(data)
captions = data['caption'].tolist()
captions[:10]
Out[25]:
['startseq child in pink dress is climbing up set of stairs in an entry way endseq', 'startseq girl going into wooden building endseq', 'startseq little girl climbing into wooden playhouse endseq', 'startseq little girl climbing the stairs to her playhouse endseq', 'startseq little girl in pink dress going into wooden cabin endseq', 'startseq black dog and spotted dog are fighting endseq', 'startseq black dog and tri-colored dog playing with each other on the road endseq', 'startseq black dog and white dog with brown spots are staring at each other in the street endseq', 'startseq two dogs of different breeds looking at each other on the road endseq', 'startseq two dogs on pavement moving toward each other endseq']
Tokenization and Encoded Representation¶
- The words in a sentence are separated/tokenized and encoded in a one hot representation
- These encodings are then passed to the embeddings layer to generate word embeddings
In [ ]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions)
vocab_size = len(tokenizer.word_index) + 1
max_length = max(len(caption.split()) for caption in captions)
images = data['image'].unique().tolist()
nimages = len(images)
split_index = round(0.85*nimages)
train_images = images[:split_index]
val_images = images[split_index:]
train = data[data['image'].isin(train_images)]
test = data[data['image'].isin(val_images)]
train.reset_index(inplace=True,drop=True)
test.reset_index(inplace=True,drop=True)
tokenizer.texts_to_sequences([captions[1]])[0]
Out[ ]:
[1, 18, 315, 63, 195, 116, 2]
Image Feature Extraction¶
- DenseNet 201 Architecture is used to extract the features from the images
- Any other pretrained architecture can also be used for extracting features from these images
- Since the Global Average Pooling layer is selected as the final layer of the DenseNet201 model for our feature extraction, our image embeddings will be a vector of size 1920
In [41]:
data = pd.read_csv(CAPTIONS_FILE)
img_size = 224
base = DenseNet201(include_top=False, pooling='avg')
fe = Model(inputs=base.input, outputs=base.output)
features = {}
for image in tqdm(data['image'].unique().tolist()):
try:
img_path = os.path.join(IMAGE_FOLDER, image)
img = load_img(img_path, target_size=(img_size, img_size))
img = img_to_array(img)
img = img / 255.
img = np.expand_dims(img, axis=0)
feature = fe.predict(img, verbose=0)
features[image] = feature
except Exception as e:
print("Error extracting:", image, e)
100%|██████████| 8091/8091 [24:09<00:00, 5.58it/s]
In [42]:
all_images = data['image'].unique().tolist()
missing = [img for img in all_images if img not in features]
print("Missing features:", len(missing))
for image in missing:
try:
img_path = os.path.join(IMAGE_FOLDER, image)
img = load_img(img_path, target_size=(img_size, img_size))
img = img_to_array(img)
img = img / 255.
img = np.expand_dims(img, axis=0)
feature = fe.predict(img, verbose=0)
features[image] = feature
print("Recovered:", image)
except Exception as e:
print("FAILED:", image, e)
print("Final feature count:", len(features))
Missing features: 0 Final feature count: 8091
Data Generation¶
- Since Image Caption model training like any other neural network training is a highly resource utillizing process we cannot load the data into the main memory all at once, and hence we need to generate the data in the required format batch wise
- The inputs will be the image embeddings and their corresonding caption text embeddings for the training process
- The text embeddings are passed word by word for the caption generation during inference time
In [43]:
class CustomDataGenerator(Sequence):
def __init__(self, df, X_col, y_col, batch_size, directory, tokenizer,
vocab_size, max_length, features,shuffle=True):
self.df = df.copy()
self.X_col = X_col
self.y_col = y_col
self.directory = directory
self.batch_size = batch_size
self.tokenizer = tokenizer
self.vocab_size = vocab_size
self.max_length = max_length
self.features = features
self.shuffle = shuffle
self.n = len(self.df)
def on_epoch_end(self):
if self.shuffle:
self.df = self.df.sample(frac=1).reset_index(drop=True)
def __len__(self):
return self.n // self.batch_size
def __getitem__(self,index):
batch = self.df.iloc[index * self.batch_size:(index + 1) * self.batch_size,:]
X1, X2, y = self.__get_data(batch)
return (X1, X2), y
def __get_data(self,batch):
X1, X2, y = list(), list(), list()
images = batch[self.X_col].tolist()
for image in images:
feature = self.features[image][0]
captions = batch.loc[batch[self.X_col]==image, self.y_col].tolist()
for caption in captions:
seq = self.tokenizer.texts_to_sequences([caption])[0]
for i in range(1,len(seq)):
in_seq, out_seq = seq[:i], seq[i]
in_seq = pad_sequences([in_seq], maxlen=self.max_length)[0]
out_seq = to_categorical([out_seq], num_classes=self.vocab_size)[0]
X1.append(feature)
X2.append(in_seq)
y.append(out_seq)
X1, X2, y = np.array(X1), np.array(X2), np.array(y)
return X1, X2, y
train_generator = CustomDataGenerator(df=train,X_col='image',y_col='caption',batch_size=64,directory=image_path,
tokenizer=tokenizer,vocab_size=vocab_size,max_length=max_length,features=features)
validation_generator = CustomDataGenerator(df=test,X_col='image',y_col='caption',batch_size=64,directory=image_path,
tokenizer=tokenizer,vocab_size=vocab_size,max_length=max_length,features=features)
Modelling¶
- The image embedding representations are concatenated with the first word of sentence ie. starseq and passed to the LSTM network
- The LSTM network starts generating words after each input thus forming a sentence at the end
In [44]:
from tensorflow.keras.utils import plot_model
input1 = Input(shape=(1920,))
input2 = Input(shape=(max_length,))
img_features = Dense(256, activation='relu')(input1)
img_features_reshaped = Reshape((1, 256), input_shape=(256,))(img_features)
sentence_features = Embedding(vocab_size, 256, mask_zero=False)(input2)
merged = concatenate([img_features_reshaped,sentence_features],axis=1)
sentence_features = LSTM(256)(merged)
x = Dropout(0.5)(sentence_features)
x = add([x, img_features])
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
output = Dense(vocab_size, activation='softmax')(x)
caption_model = Model(inputs=[input1,input2], outputs=output)
caption_model.compile(loss='categorical_crossentropy',optimizer='adam')
from tensorflow.keras.callbacks import ModelCheckpoint
# Define the model checkpoint
model_name = "model.keras" # Update the extension to .keras
checkpoint = ModelCheckpoint(
model_name,
monitor="val_loss",
mode="min",
save_best_only=True,
save_weights_only=False,
verbose=1
)
earlystopping = EarlyStopping(monitor='val_loss',min_delta = 0, patience = 5, verbose = 1, restore_best_weights=True)
learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss',
patience=3,
verbose=1,
factor=0.2,
min_lr=0.00000001)
history = caption_model.fit(
train_generator,
epochs=50,
validation_data=validation_generator,
callbacks=[checkpoint,earlystopping,learning_rate_reduction])
Epoch 1/50 537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 388ms/step - loss: 5.7148 Epoch 1: val_loss improved from None to 4.28454, saving model to model.keras 537/537 ━━━━━━━━━━━━━━━━━━━━ 228s 421ms/step - loss: 5.0959 - val_loss: 4.2845 - learning_rate: 0.0010 Epoch 2/50 537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 399ms/step - loss: 4.3362 Epoch 2: val_loss improved from 4.28454 to 3.99680, saving model to model.keras 537/537 ━━━━━━━━━━━━━━━━━━━━ 269s 434ms/step - loss: 4.2786 - val_loss: 3.9968 - learning_rate: 0.0010 Epoch 3/50 537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 417ms/step - loss: 4.0488 Epoch 3: val_loss improved from 3.99680 to 3.84010, saving model to model.keras 537/537 ━━━━━━━━━━━━━━━━━━━━ 241s 448ms/step - loss: 4.0061 - val_loss: 3.8401 - learning_rate: 0.0010 Epoch 4/50 537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 422ms/step - loss: 3.8458 Epoch 4: val_loss improved from 3.84010 to 3.73999, saving model to model.keras 537/537 ━━━━━━━━━━━━━━━━━━━━ 245s 456ms/step - loss: 3.8373 - val_loss: 3.7400 - learning_rate: 0.0010 Epoch 5/50 537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 426ms/step - loss: 3.7164 Epoch 5: val_loss improved from 3.73999 to 3.68348, saving model to model.keras 537/537 ━━━━━━━━━━━━━━━━━━━━ 247s 460ms/step - loss: 3.7104 - val_loss: 3.6835 - learning_rate: 0.0010 Epoch 6/50 537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 425ms/step - loss: 3.6117 Epoch 6: val_loss improved from 3.68348 to 3.66124, saving model to model.keras 537/537 ━━━━━━━━━━━━━━━━━━━━ 246s 457ms/step - loss: 3.6104 - val_loss: 3.6612 - learning_rate: 0.0010 Epoch 7/50 537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 420ms/step - loss: 3.5224 Epoch 7: val_loss improved from 3.66124 to 3.64358, saving model to model.keras 537/537 ━━━━━━━━━━━━━━━━━━━━ 247s 461ms/step - loss: 3.5291 - val_loss: 3.6436 - learning_rate: 0.0010 Epoch 8/50 537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 419ms/step - loss: 3.4494 Epoch 8: val_loss improved from 3.64358 to 3.62544, saving model to model.keras 537/537 ━━━━━━━━━━━━━━━━━━━━ 243s 452ms/step - loss: 3.4613 - val_loss: 3.6254 - learning_rate: 0.0010 Epoch 9/50 537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 424ms/step - loss: 3.3864 Epoch 9: val_loss improved from 3.62544 to 3.61507, saving model to model.keras 537/537 ━━━━━━━━━━━━━━━━━━━━ 246s 458ms/step - loss: 3.3953 - val_loss: 3.6151 - learning_rate: 0.0010 Epoch 10/50 537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 419ms/step - loss: 3.3339 Epoch 10: val_loss improved from 3.61507 to 3.61181, saving model to model.keras 537/537 ━━━━━━━━━━━━━━━━━━━━ 242s 451ms/step - loss: 3.3483 - val_loss: 3.6118 - learning_rate: 0.0010 Epoch 11/50 537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 433ms/step - loss: 3.2865 Epoch 11: val_loss did not improve from 3.61181 537/537 ━━━━━━━━━━━━━━━━━━━━ 249s 465ms/step - loss: 3.3003 - val_loss: 3.6374 - learning_rate: 0.0010 Epoch 12/50 537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 410ms/step - loss: 3.2425 Epoch 12: val_loss did not improve from 3.61181 537/537 ━━━━━━━━━━━━━━━━━━━━ 237s 441ms/step - loss: 3.2567 - val_loss: 3.6462 - learning_rate: 0.0010 Epoch 13/50 537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 409ms/step - loss: 3.2063 Epoch 13: val_loss did not improve from 3.61181 Epoch 13: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026. 537/537 ━━━━━━━━━━━━━━━━━━━━ 237s 441ms/step - loss: 3.2202 - val_loss: 3.6673 - learning_rate: 0.0010 Epoch 14/50 537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 423ms/step - loss: 3.1140 Epoch 14: val_loss did not improve from 3.61181 537/537 ━━━━━━━━━━━━━━━━━━━━ 245s 457ms/step - loss: 3.1147 - val_loss: 3.6657 - learning_rate: 2.0000e-04 Epoch 15/50 537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 429ms/step - loss: 3.0936 Epoch 15: val_loss did not improve from 3.61181 537/537 ━━━━━━━━━━━━━━━━━━━━ 248s 463ms/step - loss: 3.0907 - val_loss: 3.6787 - learning_rate: 2.0000e-04 Epoch 15: early stopping Restoring model weights from the end of the best epoch: 10.
In [45]:
print(len(os.listdir(IMAGE_FOLDER)))
print(data['image'].nunique())
8091 8091
Learning Curve¶
- The model has clearly overfit, possibly due to less amount of data
- We can tackle this problem in two ways
- Train the model on a larger dataset Flickr40k
- Attention Models
In [46]:
plt.figure(figsize=(20,8))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()
Caption Generation Utility Functions¶
- Utility functions to generate the captions of input images at the inference time.
- Here the image embeddings are passed along with the first word, followed by which the text embedding of each new word is passed to generate the next word
In [54]:
import pickle
# Save the tokenizer
with open("tokenizer.pkl", "wb") as f:
pickle.dump(tokenizer, f)
# Save the feature extractor model
fe.save("feature_extractor.keras")
In [55]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import matplotlib.pyplot as plt
import pickle
# load save files
model_path = "model.keras"
tokenizer_path = "tokenizer.pkl"
feature_extractor_path = "feature_extractor.keras"
def generate_and_display_caption(image_path, model_path, tokenizer_path, feature_extractor_path, max_length=34, img_size=224):
# Load the trained models and tokenizer
caption_model = load_model(model_path)
feature_extractor = load_model(feature_extractor_path)
with open(tokenizer_path, "rb") as f:
tokenizer = pickle.load(f)
# Preprocess the image
img = load_img(image_path, target_size=(img_size, img_size))
img = img_to_array(img) / 255.0 # Normalize pixel values
img = np.expand_dims(img, axis=0)
image_features = feature_extractor.predict(img, verbose=0) # Extract image features
# Generate the caption
in_text = "startseq"
for i in range(max_length):
sequence = tokenizer.texts_to_sequences([in_text])[0]
sequence = pad_sequences([sequence], maxlen=max_length)
yhat = caption_model.predict([image_features, sequence], verbose=0)
yhat_index = np.argmax(yhat)
word = tokenizer.index_word.get(yhat_index, None)
if word is None:
break
in_text += " " + word
if word == "endseq":
break
caption = in_text.replace("startseq", "").replace("endseq", "").strip()
# Display the image with the generated caption
img = load_img(image_path, target_size=(img_size, img_size))
plt.figure(figsize=(8, 8))
plt.imshow(img)
plt.axis('off')
plt.title(caption, fontsize=16, color='blue')
plt.show()
In [56]:
import os
print(os.listdir())
['.gitignore', 'feature_extractor.keras', 'flickr8k-image-captioning-using-cnns-lstms (1).ipynb', 'flickr8k-image-captioning-using-cnns-lstms.ipynb', 'img.png', 'img_1.png', 'img_2.png', 'img_3.png', 'main.py', 'model.keras', 'README.md', 'tokenizer.pkl', 'uploaded_image.jpg']
In [58]:
# Example usage
image_path = "img_1.png"
generate_and_display_caption(image_path, "model.keras", "tokenizer.pkl", "feature_extractor.keras")
In [59]:
image_path = "img_2.png"
generate_and_display_caption(image_path, "model.keras", "tokenizer.pkl", "feature_extractor.keras")
In [62]:
image_path = "img.png"
generate_and_display_caption(image_path, "model.keras", "tokenizer.pkl", "feature_extractor.keras")
In [ ]: